Supervised Learning-II#

Linear Regression#

  • Task: Regression

Simple Linear Regression

  • Input has only one variable

  • Model: y=ax+b

  • Closest line to the data points.

  • The squared vertical distances (residuals) between the data points and the line are summed, and this sum is minimized

Multiple Linear Regression

  • Input has more than one variable

  • Model: If there are two variables \(x_1\) and \(x_2\)

    • \(y=a_1x_1+a_2x_2+b\)

  • Closest plane (hyperplane, for more than two variables) to the data points.

import numpy as np
import matplotlib.pyplot as plt

# Synthetic data: 20 evenly spaced points on the line y = 2x + 3, plus Gaussian noise.
x = np.linspace(0, 1, 20)
noise = np.random.randn(20) / 3
y = 2 * x + 3 + noise

# Train on a handful of samples; a single shared index list keeps x and y aligned.
train_idx = [1, 4, 6, 14, 18]
x_train = x[train_idx]
y_train = y[train_idx]

from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
# The estimator is given a 2-D column of inputs, so reshape once and reuse it.
x_train_col = x_train.reshape(-1, 1)
lin_reg.fit(x_train_col, y_train)
y_l = lin_reg.predict(x_train_col)

# Draw each residual as a dashed vertical segment from the observed value
# to the value predicted by the fitted line.
for x_i, y_i, y_hat in zip(x_train, y_train, y_l):
    plt.plot([x_i, x_i], [y_i, y_hat], 'r--')

plt.scatter(x_train, y_train, label='training', c='b')
plt.plot(x_train, y_l, label='linear model', c='orange')
plt.title('Linear Model and Residuals', fontsize=20)
plt.legend();
_images/27bffa0c0fbcc8766050346848f7b394b720eb952ed7a095b7d956af87dc7a4e.png

Simple Linear Regression#

# Use California housing data
# Use MedInc (median income) to predict price
from sklearn.datasets import fetch_california_housing
dataset = fetch_california_housing()
dataset.keys()
dict_keys(['data', 'target', 'frame', 'target_names', 'feature_names', 'DESCR'])
# feature names
dataset.feature_names
['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']
# X,y
X = dataset.data[:,0]
y = dataset.target
# X,y shapes
X.shape, y.shape
((20640,), (20640,))
# scatter plot of MedInc vs MEDV
import matplotlib.pyplot as plt
plt.scatter(X,y)
plt.xlabel('MedInc')
plt.ylabel('MEDV')
plt.title('California Housing Price');
_images/315793decd524f05b987c841e6e807f0f7bdd78cdaec3396a6363d55937ef4bc.png
# train_test_split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# Plot the training and test sets on the same axes so the split can be compared visually.
plt.figure(figsize=(10, 10))
plt.scatter(X_train, y_train, label='Training Set', c='blue')
plt.scatter(X_test, y_test, label='Test Set', c='r')

plt.xlabel('MedInc')
# Same target label as the earlier scatter plot (was misspelled 'MDEV').
plt.ylabel('MEDV')
plt.title('California Housing Price')
plt.legend();
_images/a400cffe1eb37aa6c40a5cd1089e6e1313bc1d4b2d50ff3a387c18bb9794051d.png
# use plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()

fig.add_trace(go.Scatter(x=X_train, y=y_train, mode='markers',  marker=dict(color='blue'), name="training")  )
fig.add_trace(go.Scatter(x=X_test, y=y_test, mode='markers',marker=dict(color='red'),  name="test")  )


fig.update_layout(title_text="Training and Test Sets for California Housing Data")
# fit model
from sklearn.linear_model import LinearRegression
lin_reg = LinearRegression()
lin_reg.fit(X_train.reshape(-1,1),y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# intercept
b = lin_reg.intercept_
b
0.44879836696807707
# coefficient
m = lin_reg.coef_
m
array([0.41731856])
# line x and y
import numpy as np
x_lin = np.linspace(0,40,100)
y_lin = m*x_lin+b
# use plotly to plot training, test, linear model
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()

fig.add_trace(go.Scatter(x=X_train, y=y_train, mode='markers',  marker=dict(color='blue'), name="training")  )
fig.add_trace(go.Scatter(x=X_test, y=y_test, mode='markers',marker=dict(color='red'),  name="test")  )
fig.add_trace(go.Scatter(x=x_lin, y=y_lin ,marker=dict(color='green'),  name="Linear Model")  )


fig.update_layout(title_text="Linear  Model")
# training score
lin_reg.score(X_train.reshape(-1,1), y_train)
0.4738509942209922
# test score
lin_reg.score(X_test.reshape(-1,1), y_test)
0.4725720683367075
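# Restrict the data to y < 40 (y_r)
# NOTE: no target value reaches 40 here, so this mask keeps every row: the shapes,
# coefficients and scores below are identical to the unrestricted fit above.
# The threshold appears to be left over from a different dataset; a smaller value
# is needed for the restriction to remove anything (TODO: confirm the target range).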
y_r = y[y < 40]
y_r.shape
(20640,)
# X_r
X_r = X[y < 40]
X_r.shape
(20640,)
# train test split
Xr_train, Xr_test, yr_train, yr_test = train_test_split(X_r, y_r, test_size=0.33, random_state=42)
# fit the model
lin_reg = LinearRegression()
lin_reg.fit(Xr_train.reshape(-1,1),yr_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# coefficient
m = lin_reg.coef_
m
array([0.41731856])
# intercept
b = lin_reg.intercept_
b
0.44879836696807707
# line x and y
xr_lin = np.linspace(0,40,100)
yr_lin = m*xr_lin+b
# # plotly restricted train, test, linear model
import plotly.graph_objects as go
from plotly.subplots import make_subplots

fig = make_subplots()

fig.add_trace(go.Scatter(x=Xr_train, y=yr_train, mode='markers',  marker=dict(color='blue'), name="training")  )
fig.add_trace(go.Scatter(x=Xr_test, y=yr_test, mode='markers',marker=dict(color='red'),  name="test")  )
fig.add_trace(go.Scatter(x=xr_lin, y=yr_lin ,marker=dict(color='green'),  name="Linear Model")  )


fig.update_layout(title_text="Linear  Model")
# training score
lin_reg.score(Xr_train.reshape(-1,1), yr_train)
0.4738509942209922
# test score
lin_reg.score(Xr_test.reshape(-1,1), yr_test)
0.4725720683367075

Multiple Linear Regression#

X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
# fit the model
lin_reg = LinearRegression()
lin_reg.fit(X_train,y_train)
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# intercept
b = lin_reg.intercept_
b
-37.08201093908004
# coefficients
m = lin_reg.coef_
m
array([ 4.44870466e-01,  9.55004561e-03, -1.21991503e-01,  7.79144696e-01,
       -7.68990809e-08, -3.29948505e-03, -4.19131153e-01, -4.34103468e-01])
# coefficient shape
m.shape
(8,)
# training score
lin_reg.score(X_train  , y_train )
0.609370412027382
# test score
lin_reg.score(X_test  , y_test )
0.597049412878397
# actual vs predicted scatter plot
plt.scatter(y_test ,lin_reg.predict(X_test  ))
plt.plot([0,10],[0,10], 'r--')
plt.title('Comparison of Actual and Predicted Values')
plt.xlabel('actual')
plt.ylabel('predicted');
_images/3fe005461e6ab03a6542e101af064cc321fc83d081447a10d436c6f2ffb7f696.png
# coefficients in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, lin_reg.coef_);
_images/ceb95cb8753c0b0323f74e365e37e81bf49d5fc0681ab5bb148e7d7cc0066d21.png
# Indices that order the coefficients from smallest to largest.
# Named sorted_idx so the builtin sorted() is not shadowed for the rest of the session.
sorted_idx = np.argsort(lin_reg.coef_)
sorted_idx
array([7, 6, 2, 5, 4, 1, 0, 3])
dataset.feature_names
['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']
# feature names reordered to match the sorted coefficients
np.array(dataset.feature_names)[sorted_idx]
array(['Longitude', 'Latitude', 'AveRooms', 'AveOccup', 'Population',
       'HouseAge', 'MedInc', 'AveBedrms'], dtype='<U10')
lin_reg.coef_[sorted_idx]
array([-4.34103468e-01, -4.19131153e-01, -1.21991503e-01, -3.29948505e-03,
       -7.68990809e-08,  9.55004561e-03,  4.44870466e-01,  7.79144696e-01])
# sorted coefficients in a bar graph

plt.figure(figsize=(10,5))
plt.bar(np.array(dataset.feature_names)[sorted_idx], lin_reg.coef_[sorted_idx]);
_images/123f5f0ec60fd57ca5a58ff92edbafc103f1a4917662bbdd10e9ffa9b73be945.png

Decision Tree#

  • consists of a hierarchy of if/else questions

  • predict the value of a target variable by answering these if/else questions

  • find the smallest tree that fits the data.

  • for numeric features (as in regression tasks), each question asks whether a feature is less than some threshold or not.

Slide5.JPG

Gini Impurity#

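Start with the question (split) that minimizes the Gini impurity of the resulting groups:
\(G = 1- \sum_i p_i^2\), where \(p_i\) is the fraction of samples in a group that belong to class \(i\).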

Decision Tree Classifier#

# instantiate the class into an object
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
# help
help(dtc)
Help on DecisionTreeClassifier in module sklearn.tree._classes object:

class DecisionTreeClassifier(sklearn.base.ClassifierMixin, BaseDecisionTree)
 |  DecisionTreeClassifier(*, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
 |  
 |  A decision tree classifier.
 |  
 |  Read more in the :ref:`User Guide <tree>`.
 |  
 |  Parameters
 |  ----------
 |  criterion : {"gini", "entropy", "log_loss"}, default="gini"
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "log_loss" and "entropy" both for the
 |      Shannon information gain, see :ref:`tree_mathematical_formulation`.
 |  
 |  splitter : {"best", "random"}, default="best"
 |      The strategy used to choose the split at each node. Supported
 |      strategies are "best" to choose the best split and "random" to choose
 |      the best random split.
 |  
 |  max_depth : int, default=None
 |      The maximum depth of the tree. If None, then nodes are expanded until
 |      all leaves are pure or until all leaves contain less than
 |      min_samples_split samples.
 |  
 |  min_samples_split : int or float, default=2
 |      The minimum number of samples required to split an internal node:
 |  
 |      - If int, then consider `min_samples_split` as the minimum number.
 |      - If float, then `min_samples_split` is a fraction and
 |        `ceil(min_samples_split * n_samples)` are the minimum
 |        number of samples for each split.
 |  
 |      .. versionchanged:: 0.18
 |         Added float values for fractions.
 |  
 |  min_samples_leaf : int or float, default=1
 |      The minimum number of samples required to be at a leaf node.
 |      A split point at any depth will only be considered if it leaves at
 |      least ``min_samples_leaf`` training samples in each of the left and
 |      right branches.  This may have the effect of smoothing the model,
 |      especially in regression.
 |  
 |      - If int, then consider `min_samples_leaf` as the minimum number.
 |      - If float, then `min_samples_leaf` is a fraction and
 |        `ceil(min_samples_leaf * n_samples)` are the minimum
 |        number of samples for each node.
 |  
 |      .. versionchanged:: 0.18
 |         Added float values for fractions.
 |  
 |  min_weight_fraction_leaf : float, default=0.0
 |      The minimum weighted fraction of the sum total of weights (of all
 |      the input samples) required to be at a leaf node. Samples have
 |      equal weight when sample_weight is not provided.
 |  
 |  max_features : int, float or {"auto", "sqrt", "log2"}, default=None
 |      The number of features to consider when looking for the best split:
 |  
 |          - If int, then consider `max_features` features at each split.
 |          - If float, then `max_features` is a fraction and
 |            `max(1, int(max_features * n_features_in_))` features are considered at
 |            each split.
 |          - If "auto", then `max_features=sqrt(n_features)`.
 |          - If "sqrt", then `max_features=sqrt(n_features)`.
 |          - If "log2", then `max_features=log2(n_features)`.
 |          - If None, then `max_features=n_features`.
 |  
 |          .. deprecated:: 1.1
 |              The `"auto"` option was deprecated in 1.1 and will be removed
 |              in 1.3.
 |  
 |      Note: the search for a split does not stop until at least one
 |      valid partition of the node samples is found, even if it requires to
 |      effectively inspect more than ``max_features`` features.
 |  
 |  random_state : int, RandomState instance or None, default=None
 |      Controls the randomness of the estimator. The features are always
 |      randomly permuted at each split, even if ``splitter`` is set to
 |      ``"best"``. When ``max_features < n_features``, the algorithm will
 |      select ``max_features`` at random at each split before finding the best
 |      split among them. But the best found split may vary across different
 |      runs, even if ``max_features=n_features``. That is the case, if the
 |      improvement of the criterion is identical for several splits and one
 |      split has to be selected at random. To obtain a deterministic behaviour
 |      during fitting, ``random_state`` has to be fixed to an integer.
 |      See :term:`Glossary <random_state>` for details.
 |  
 |  max_leaf_nodes : int, default=None
 |      Grow a tree with ``max_leaf_nodes`` in best-first fashion.
 |      Best nodes are defined as relative reduction in impurity.
 |      If None then unlimited number of leaf nodes.
 |  
 |  min_impurity_decrease : float, default=0.0
 |      A node will be split if this split induces a decrease of the impurity
 |      greater than or equal to this value.
 |  
 |      The weighted impurity decrease equation is the following::
 |  
 |          N_t / N * (impurity - N_t_R / N_t * right_impurity
 |                              - N_t_L / N_t * left_impurity)
 |  
 |      where ``N`` is the total number of samples, ``N_t`` is the number of
 |      samples at the current node, ``N_t_L`` is the number of samples in the
 |      left child, and ``N_t_R`` is the number of samples in the right child.
 |  
 |      ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
 |      if ``sample_weight`` is passed.
 |  
 |      .. versionadded:: 0.19
 |  
 |  class_weight : dict, list of dict or "balanced", default=None
 |      Weights associated with classes in the form ``{class_label: weight}``.
 |      If None, all classes are supposed to have weight one. For
 |      multi-output problems, a list of dicts can be provided in the same
 |      order as the columns of y.
 |  
 |      Note that for multioutput (including multilabel) weights should be
 |      defined for each class of every column in its own dict. For example,
 |      for four-class multilabel classification weights should be
 |      [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
 |      [{1:1}, {2:5}, {3:1}, {4:1}].
 |  
 |      The "balanced" mode uses the values of y to automatically adjust
 |      weights inversely proportional to class frequencies in the input data
 |      as ``n_samples / (n_classes * np.bincount(y))``
 |  
 |      For multi-output, the weights of each column of y will be multiplied.
 |  
 |      Note that these weights will be multiplied with sample_weight (passed
 |      through the fit method) if sample_weight is specified.
 |  
 |  ccp_alpha : non-negative float, default=0.0
 |      Complexity parameter used for Minimal Cost-Complexity Pruning. The
 |      subtree with the largest cost complexity that is smaller than
 |      ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
 |      :ref:`minimal_cost_complexity_pruning` for details.
 |  
 |      .. versionadded:: 0.22
 |  
 |  Attributes
 |  ----------
 |  classes_ : ndarray of shape (n_classes,) or list of ndarray
 |      The classes labels (single output problem),
 |      or a list of arrays of class labels (multi-output problem).
 |  
 |  feature_importances_ : ndarray of shape (n_features,)
 |      The impurity-based feature importances.
 |      The higher, the more important the feature.
 |      The importance of a feature is computed as the (normalized)
 |      total reduction of the criterion brought by that feature.  It is also
 |      known as the Gini importance [4]_.
 |  
 |      Warning: impurity-based feature importances can be misleading for
 |      high cardinality features (many unique values). See
 |      :func:`sklearn.inspection.permutation_importance` as an alternative.
 |  
 |  max_features_ : int
 |      The inferred value of max_features.
 |  
 |  n_classes_ : int or list of int
 |      The number of classes (for single output problems),
 |      or a list containing the number of classes for each
 |      output (for multi-output problems).
 |  
 |  n_features_in_ : int
 |      Number of features seen during :term:`fit`.
 |  
 |      .. versionadded:: 0.24
 |  
 |  feature_names_in_ : ndarray of shape (`n_features_in_`,)
 |      Names of features seen during :term:`fit`. Defined only when `X`
 |      has feature names that are all strings.
 |  
 |      .. versionadded:: 1.0
 |  
 |  n_outputs_ : int
 |      The number of outputs when ``fit`` is performed.
 |  
 |  tree_ : Tree instance
 |      The underlying Tree object. Please refer to
 |      ``help(sklearn.tree._tree.Tree)`` for attributes of Tree object and
 |      :ref:`sphx_glr_auto_examples_tree_plot_unveil_tree_structure.py`
 |      for basic usage of these attributes.
 |  
 |  See Also
 |  --------
 |  DecisionTreeRegressor : A decision tree regressor.
 |  
 |  Notes
 |  -----
 |  The default values for the parameters controlling the size of the trees
 |  (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
 |  unpruned trees which can potentially be very large on some data sets. To
 |  reduce memory consumption, the complexity and size of the trees should be
 |  controlled by setting those parameter values.
 |  
 |  The :meth:`predict` method operates using the :func:`numpy.argmax`
 |  function on the outputs of :meth:`predict_proba`. This means that in
 |  case the highest predicted probabilities are tied, the classifier will
 |  predict the tied class with the lowest index in :term:`classes_`.
 |  
 |  References
 |  ----------
 |  
 |  .. [1] https://en.wikipedia.org/wiki/Decision_tree_learning
 |  
 |  .. [2] L. Breiman, J. Friedman, R. Olshen, and C. Stone, "Classification
 |         and Regression Trees", Wadsworth, Belmont, CA, 1984.
 |  
 |  .. [3] T. Hastie, R. Tibshirani and J. Friedman. "Elements of Statistical
 |         Learning", Springer, 2009.
 |  
 |  .. [4] L. Breiman, and A. Cutler, "Random Forests",
 |         https://www.stat.berkeley.edu/~breiman/RandomForests/cc_home.htm
 |  
 |  Examples
 |  --------
 |  >>> from sklearn.datasets import load_iris
 |  >>> from sklearn.model_selection import cross_val_score
 |  >>> from sklearn.tree import DecisionTreeClassifier
 |  >>> clf = DecisionTreeClassifier(random_state=0)
 |  >>> iris = load_iris()
 |  >>> cross_val_score(clf, iris.data, iris.target, cv=10)
 |  ...                             # doctest: +SKIP
 |  ...
 |  array([ 1.     ,  0.93...,  0.86...,  0.93...,  0.93...,
 |          0.93...,  0.93...,  1.     ,  0.93...,  1.      ])
 |  
 |  Method resolution order:
 |      DecisionTreeClassifier
 |      sklearn.base.ClassifierMixin
 |      BaseDecisionTree
 |      sklearn.base.MultiOutputMixin
 |      sklearn.base.BaseEstimator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, *, criterion='gini', splitter='best', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features=None, random_state=None, max_leaf_nodes=None, min_impurity_decrease=0.0, class_weight=None, ccp_alpha=0.0)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  fit(self, X, y, sample_weight=None, check_input=True)
 |      Build a decision tree classifier from the training set (X, y).
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The training input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csc_matrix``.
 |      
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          The target values (class labels) as integers or strings.
 |      
 |      sample_weight : array-like of shape (n_samples,), default=None
 |          Sample weights. If None, then samples are equally weighted. Splits
 |          that would create child nodes with net zero or negative weight are
 |          ignored while searching for a split in each node. Splits are also
 |          ignored if they would result in any single class carrying a
 |          negative weight in either child node.
 |      
 |      check_input : bool, default=True
 |          Allow to bypass several input checking.
 |          Don't use this parameter unless you know what you're doing.
 |      
 |      Returns
 |      -------
 |      self : DecisionTreeClassifier
 |          Fitted estimator.
 |  
 |  predict_log_proba(self, X)
 |      Predict class log-probabilities of the input samples X.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      proba : ndarray of shape (n_samples, n_classes) or list of n_outputs             such arrays if n_outputs > 1
 |          The class log-probabilities of the input samples. The order of the
 |          classes corresponds to that in the attribute :term:`classes_`.
 |  
 |  predict_proba(self, X, check_input=True)
 |      Predict class probabilities of the input samples X.
 |      
 |      The predicted class probability is the fraction of samples of the same
 |      class in a leaf.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csr_matrix``.
 |      
 |      check_input : bool, default=True
 |          Allow to bypass several input checking.
 |          Don't use this parameter unless you know what you're doing.
 |      
 |      Returns
 |      -------
 |      proba : ndarray of shape (n_samples, n_classes) or list of n_outputs             such arrays if n_outputs > 1
 |          The class probabilities of the input samples. The order of the
 |          classes corresponds to that in the attribute :term:`classes_`.
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmethods__ = frozenset()
 |  
 |  __annotations__ = {'_parameter_constraints': <class 'dict'>}
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.ClassifierMixin:
 |  
 |  score(self, X, y, sample_weight=None)
 |      Return the mean accuracy on the given test data and labels.
 |      
 |      In multi-label classification, this is the subset accuracy
 |      which is a harsh metric since you require for each sample that
 |      each label set be correctly predicted.
 |      
 |      Parameters
 |      ----------
 |      X : array-like of shape (n_samples, n_features)
 |          Test samples.
 |      
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          True labels for `X`.
 |      
 |      sample_weight : array-like of shape (n_samples,), default=None
 |          Sample weights.
 |      
 |      Returns
 |      -------
 |      score : float
 |          Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.ClassifierMixin:
 |  
 |  __dict__
 |      dictionary for instance variables
 |  
 |  __weakref__
 |      list of weak references to the object
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from BaseDecisionTree:
 |  
 |  apply(self, X, check_input=True)
 |      Return the index of the leaf that each sample is predicted as.
 |      
 |      .. versionadded:: 0.17
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csr_matrix``.
 |      
 |      check_input : bool, default=True
 |          Allow to bypass several input checking.
 |          Don't use this parameter unless you know what you're doing.
 |      
 |      Returns
 |      -------
 |      X_leaves : array-like of shape (n_samples,)
 |          For each datapoint x in X, return the index of the leaf x
 |          ends up in. Leaves are numbered within
 |          ``[0; self.tree_.node_count)``, possibly with gaps in the
 |          numbering.
 |  
 |  cost_complexity_pruning_path(self, X, y, sample_weight=None)
 |      Compute the pruning path during Minimal Cost-Complexity Pruning.
 |      
 |      See :ref:`minimal_cost_complexity_pruning` for details on the pruning
 |      process.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The training input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csc_matrix``.
 |      
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          The target values (class labels) as integers or strings.
 |      
 |      sample_weight : array-like of shape (n_samples,), default=None
 |          Sample weights. If None, then samples are equally weighted. Splits
 |          that would create child nodes with net zero or negative weight are
 |          ignored while searching for a split in each node. Splits are also
 |          ignored if they would result in any single class carrying a
 |          negative weight in either child node.
 |      
 |      Returns
 |      -------
 |      ccp_path : :class:`~sklearn.utils.Bunch`
 |          Dictionary-like object, with the following attributes.
 |      
 |          ccp_alphas : ndarray
 |              Effective alphas of subtree during pruning.
 |      
 |          impurities : ndarray
 |              Sum of the impurities of the subtree leaves for the
 |              corresponding alpha value in ``ccp_alphas``.
 |  
 |  decision_path(self, X, check_input=True)
 |      Return the decision path in the tree.
 |      
 |      .. versionadded:: 0.18
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csr_matrix``.
 |      
 |      check_input : bool, default=True
 |          Allow to bypass several input checking.
 |          Don't use this parameter unless you know what you're doing.
 |      
 |      Returns
 |      -------
 |      indicator : sparse matrix of shape (n_samples, n_nodes)
 |          Return a node indicator CSR matrix where non zero elements
 |          indicates that the samples goes through the nodes.
 |  
 |  get_depth(self)
 |      Return the depth of the decision tree.
 |      
 |      The depth of a tree is the maximum distance between the root
 |      and any leaf.
 |      
 |      Returns
 |      -------
 |      self.tree_.max_depth : int
 |          The maximum depth of the tree.
 |  
 |  get_n_leaves(self)
 |      Return the number of leaves of the decision tree.
 |      
 |      Returns
 |      -------
 |      self.tree_.n_leaves : int
 |          Number of leaves.
 |  
 |  predict(self, X, check_input=True)
 |      Predict class or regression value for X.
 |      
 |      For a classification model, the predicted class for each sample in X is
 |      returned. For a regression model, the predicted value based on X is
 |      returned.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, it will be converted to
 |          ``dtype=np.float32`` and if a sparse matrix is provided
 |          to a sparse ``csr_matrix``.
 |      
 |      check_input : bool, default=True
 |          Allow to bypass several input checking.
 |          Don't use this parameter unless you know what you're doing.
 |      
 |      Returns
 |      -------
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          The predicted classes, or the predict values.
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from BaseDecisionTree:
 |  
 |  feature_importances_
 |      Return the feature importances.
 |      
 |      The importance of a feature is computed as the (normalized) total
 |      reduction of the criterion brought by that feature.
 |      It is also known as the Gini importance.
 |      
 |      Warning: impurity-based feature importances can be misleading for
 |      high cardinality features (many unique values). See
 |      :func:`sklearn.inspection.permutation_importance` as an alternative.
 |      
 |      Returns
 |      -------
 |      feature_importances_ : ndarray of shape (n_features,)
 |          Normalized total reduction of criteria by feature
 |          (Gini importance).
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |      Helper for pickle.
 |  
 |  __repr__(self, N_CHAR_MAX=700)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : bool, default=True
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : dict
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
 |      parameters of the form ``<component>__<parameter>`` so that it's
 |      possible to update each component of a nested object.
 |      
 |      Parameters
 |      ----------
 |      **params : dict
 |          Estimator parameters.
 |      
 |      Returns
 |      -------
 |      self : estimator instance
 |          Estimator instance.
# Load the breast cancer dataset once and reuse the same object for the
# features and the target (and later for the feature and class names in the plots).
from sklearn.datasets import load_breast_cancer
dataset_bc = load_breast_cancer()
X_bc = dataset_bc.data
y_bc = dataset_bc.target
# Hold out a third of the samples for testing; the fixed seed makes the split repeatable.
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.33, random_state=42)
# fit the model
dtc.fit(X_train_bc, y_train_bc)
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# train test scores
dtc.score(X_train_bc, y_train_bc), dtc.score(X_test_bc, y_test_bc)
(1.0, 0.9202127659574468)
# sketch the tree
from sklearn import tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')

tree.plot_tree(dtc, filled=True, class_names=dataset_bc.target_names, feature_names=dataset_bc.feature_names, ax=ax);
_images/004e66a60f6eb8dc4a7290abd76d35b28aa371cb9dc6c844b95f5494817fafc1.png
# feature importances in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, dtc.feature_importances_)
plt.xticks(rotation=90);
_images/435fdd2555cd0fee64c0329a97e69b6470408fb6c6586efb13cc2bb5c21f895b.png

Important Hyperparameter

max_depth : int, default=None

  • The maximum depth of the tree.

  • If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples.

# max_depth=2
dtc2 = DecisionTreeClassifier(max_depth=2)
dtc2.fit(X_train_bc, y_train_bc)
DecisionTreeClassifier(max_depth=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# train test scores
dtc2.score(X_train_bc, y_train_bc), dtc2.score(X_test_bc, y_test_bc)
(0.9448818897637795, 0.925531914893617)
# sketch the depth-2 tree
from sklearn import tree
import matplotlib.pyplot as plt
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')

# label the nodes with the dataset's class and feature names; draw into the axes created above
tree.plot_tree(dtc2, filled=True, class_names=dataset_bc.target_names, feature_names=dataset_bc.feature_names, ax=ax);
_images/ba01fd08cdc8b9c1bd3aff18a4ff5ddce1f6dc6724799adf53eba405dac408df.png
# feature importances of the depth-2 tree (one value per input feature)
dtc2.feature_importances_
array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.88963542, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.0410358 , 0.        , 0.06932878, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ])
# feature importances in a bar graph (one bar per feature; these are importances, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, dtc2.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
_images/42309e656f0e5943692c36a146047a5d7ec6531f4a4ad35697f5c20590a941f4.png

Decision Tree Regressor#

# instantiate the regressor with default hyperparameters (no max_depth given)
from sklearn.tree import DecisionTreeRegressor
dtr = DecisionTreeRegressor()
# fit the model on the training split (X_train, y_train are defined earlier in the notebook)
dtr.fit(X_train , y_train )
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# scores as a (train, test) pair (presumably R^2, scikit-learn's default regressor score; confirm)
dtr.score(X_train , y_train ), dtr.score(X_test , y_test )
(1.0, 0.5911376198414031)
# sketch the tree
# dtr was created without a max_depth, so the fitted tree is too large to draw in full:
# rendering the whole tree did not finish and had to be interrupted.
# plot_tree's own max_depth limits only the drawing (the fitted model is unchanged),
# so show just the top levels of the tree.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')

tree.plot_tree(dtr, max_depth=3, filled=True, feature_names=dataset.feature_names, ax=ax);
Error in callback <function _draw_all_if_interactive at 0x1384a3d80> (for post_execute), with arguments args (),kwargs {}:
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
File ~/anaconda3/lib/python3.11/site-packages/matplotlib/pyplot.py:197, in _draw_all_if_interactive()
    195 def _draw_all_if_interactive() -> None:
    196     if matplotlib.is_interactive():
--> 197         draw_all()

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/_pylab_helpers.py:132, in Gcf.draw_all(cls, force)
    130 for manager in cls.get_all_fig_managers():
    131     if force or manager.canvas.figure.stale:
--> 132         manager.canvas.draw_idle()

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/backend_bases.py:1893, in FigureCanvasBase.draw_idle(self, *args, **kwargs)
   1891 if not self._is_idle_drawing:
   1892     with self._idle_draw_cntx():
-> 1893         self.draw(*args, **kwargs)

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/backends/backend_agg.py:388, in FigureCanvasAgg.draw(self)
    385 # Acquire a lock on the shared font cache.
    386 with (self.toolbar._wait_cursor_for_draw_cm() if self.toolbar
    387       else nullcontext()):
--> 388     self.figure.draw(self.renderer)
    389     # A GUI class may be need to update a window using this draw, so
    390     # don't forget to call the superclass.
    391     super().draw()

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:95, in _finalize_rasterization.<locals>.draw_wrapper(artist, renderer, *args, **kwargs)
     93 @wraps(draw)
     94 def draw_wrapper(artist, renderer, *args, **kwargs):
---> 95     result = draw(artist, renderer, *args, **kwargs)
     96     if renderer._rasterizing:
     97         renderer.stop_rasterizing()

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
     69     if artist.get_agg_filter() is not None:
     70         renderer.start_filter()
---> 72     return draw(artist, renderer)
     73 finally:
     74     if artist.get_agg_filter() is not None:

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/figure.py:3154, in Figure.draw(self, renderer)
   3151         # ValueError can occur when resizing a window.
   3153 self.patch.draw(renderer)
-> 3154 mimage._draw_list_compositing_images(
   3155     renderer, self, artists, self.suppressComposite)
   3157 for sfig in self.subfigs:
   3158     sfig.draw(renderer)

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/image.py:132, in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    130 if not_composite or not has_images:
    131     for a in artists:
--> 132         a.draw(renderer)
    133 else:
    134     # Composite any adjacent images together
    135     image_group = []

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
     69     if artist.get_agg_filter() is not None:
     70         renderer.start_filter()
---> 72     return draw(artist, renderer)
     73 finally:
     74     if artist.get_agg_filter() is not None:

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/axes/_base.py:3070, in _AxesBase.draw(self, renderer)
   3067 if artists_rasterized:
   3068     _draw_rasterized(self.figure, artists_rasterized, renderer)
-> 3070 mimage._draw_list_compositing_images(
   3071     renderer, self, artists, self.figure.suppressComposite)
   3073 renderer.close_group('axes')
   3074 self.stale = False

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/image.py:132, in _draw_list_compositing_images(renderer, parent, artists, suppress_composite)
    130 if not_composite or not has_images:
    131     for a in artists:
--> 132         a.draw(renderer)
    133 else:
    134     # Composite any adjacent images together
    135     image_group = []

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
     69     if artist.get_agg_filter() is not None:
     70         renderer.start_filter()
---> 72     return draw(artist, renderer)
     73 finally:
     74     if artist.get_agg_filter() is not None:

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/text.py:1991, in Annotation.draw(self, renderer)
   1988     self.arrow_patch.draw(renderer)
   1989 # Draw text, including FancyBboxPatch, after FancyArrowPatch.
   1990 # Otherwise, a wedge arrowstyle can land partly on top of the Bbox.
-> 1991 Text.draw(self, renderer)

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/artist.py:72, in allow_rasterization.<locals>.draw_wrapper(artist, renderer)
     69     if artist.get_agg_filter() is not None:
     70         renderer.start_filter()
---> 72     return draw(artist, renderer)
     73 finally:
     74     if artist.get_agg_filter() is not None:

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/text.py:797, in Text.draw(self, renderer)
    793             textrenderer.draw_tex(gc, x, y, clean_line,
    794                                   self._fontproperties, angle,
    795                                   mtext=mtext)
    796         else:
--> 797             textrenderer.draw_text(gc, x, y, clean_line,
    798                                    self._fontproperties, angle,
    799                                    ismath=ismath, mtext=mtext)
    801 gc.restore()
    802 renderer.close_group('text')

File ~/anaconda3/lib/python3.11/site-packages/matplotlib/backends/backend_agg.py:194, in RendererAgg.draw_text(self, gc, x, y, s, prop, angle, ismath, mtext)
    191 font = self._prepare_font(prop)
    192 # We pass '0' for angle here, since it will be rotated (in raster
    193 # space) in the following call to draw_text_image).
--> 194 font.set_text(s, 0, flags=get_hinting_flag())
    195 font.draw_glyphs_to_bitmap(
    196     antialiased=gc.get_antialiased())
    197 d = font.get_descent() / 64.0

KeyboardInterrupt: 
# feature importances of the unrestricted regressor (one value per input feature)
dtr.feature_importances_
array([0.52237857, 0.06006469, 0.05320339, 0.02897168, 0.03106369,
       0.13288264, 0.08741881, 0.08401653])
# feature importances in a bar graph (one bar per feature; these are importances, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, dtr.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
_images/33968e8ce5a876161484a2b8be21a2b2b883e8e6fb38fccbcff254971d74fef2.png
# max_depth=2: cap the maximum depth of the regression tree at 2
dtr2 = DecisionTreeRegressor(max_depth=2)
# fit the model on the same training split as dtr
dtr2.fit(X_train , y_train )
DecisionTreeRegressor(max_depth=2)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# sketch the depth-2 tree
fig, ax = plt.subplots(figsize=(10, 10), facecolor='gray')
# label the nodes with the dataset's feature names; draw into the axes created above
tree.plot_tree(dtr2, filled=True, feature_names=dataset.feature_names, ax=ax);
_images/e9a012c277e2dfed143a8dd6008dd2d95f78b9a56b0edfce7c8aa7fd98325530.png
# feature importances of the depth-2 regressor (one value per input feature)
dtr2.feature_importances_
array([1., 0., 0., 0., 0., 0., 0., 0.])
# feature importances in a bar graph (one bar per feature; these are importances, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, dtr2.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
_images/9ec19c636f5faa009a10a44b065b89e28ad6b741361571253d786881b0b24aea.png

Random Forest#

  • ensemble of decision trees for classification and regression

  • usually trained with the “bagging” method

    • each tree is trained on a random sample of the training data, drawn with replacement (a bootstrap sample)

  • builds multiple decision trees and merges them together to get a more accurate and stable prediction; it also adds additional randomness to the model while growing the trees.

  • Final Decision:

    • Soft voting for classification

    • Average for regression

Slide12.JPG

Random Forest Classifier#

# instantiate the classifier with default hyperparameters
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
# print the class documentation (parameters, attributes, methods)
help(rfc)
Help on RandomForestClassifier in module sklearn.ensemble._forest object:

class RandomForestClassifier(ForestClassifier)
 |  RandomForestClassifier(n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |  
 |  A random forest classifier.
 |  
 |  A random forest is a meta estimator that fits a number of decision tree
 |  classifiers on various sub-samples of the dataset and uses averaging to
 |  improve the predictive accuracy and control over-fitting.
 |  The sub-sample size is controlled with the `max_samples` parameter if
 |  `bootstrap=True` (default), otherwise the whole dataset is used to build
 |  each tree.
 |  
 |  Read more in the :ref:`User Guide <forest>`.
 |  
 |  Parameters
 |  ----------
 |  n_estimators : int, default=100
 |      The number of trees in the forest.
 |  
 |      .. versionchanged:: 0.22
 |         The default value of ``n_estimators`` changed from 10 to 100
 |         in 0.22.
 |  
 |  criterion : {"gini", "entropy", "log_loss"}, default="gini"
 |      The function to measure the quality of a split. Supported criteria are
 |      "gini" for the Gini impurity and "log_loss" and "entropy" both for the
 |      Shannon information gain, see :ref:`tree_mathematical_formulation`.
 |      Note: This parameter is tree-specific.
 |  
 |  max_depth : int, default=None
 |      The maximum depth of the tree. If None, then nodes are expanded until
 |      all leaves are pure or until all leaves contain less than
 |      min_samples_split samples.
 |  
 |  min_samples_split : int or float, default=2
 |      The minimum number of samples required to split an internal node:
 |  
 |      - If int, then consider `min_samples_split` as the minimum number.
 |      - If float, then `min_samples_split` is a fraction and
 |        `ceil(min_samples_split * n_samples)` are the minimum
 |        number of samples for each split.
 |  
 |      .. versionchanged:: 0.18
 |         Added float values for fractions.
 |  
 |  min_samples_leaf : int or float, default=1
 |      The minimum number of samples required to be at a leaf node.
 |      A split point at any depth will only be considered if it leaves at
 |      least ``min_samples_leaf`` training samples in each of the left and
 |      right branches.  This may have the effect of smoothing the model,
 |      especially in regression.
 |  
 |      - If int, then consider `min_samples_leaf` as the minimum number.
 |      - If float, then `min_samples_leaf` is a fraction and
 |        `ceil(min_samples_leaf * n_samples)` are the minimum
 |        number of samples for each node.
 |  
 |      .. versionchanged:: 0.18
 |         Added float values for fractions.
 |  
 |  min_weight_fraction_leaf : float, default=0.0
 |      The minimum weighted fraction of the sum total of weights (of all
 |      the input samples) required to be at a leaf node. Samples have
 |      equal weight when sample_weight is not provided.
 |  
 |  max_features : {"sqrt", "log2", None}, int or float, default="sqrt"
 |      The number of features to consider when looking for the best split:
 |  
 |      - If int, then consider `max_features` features at each split.
 |      - If float, then `max_features` is a fraction and
 |        `max(1, int(max_features * n_features_in_))` features are considered at each
 |        split.
 |      - If "auto", then `max_features=sqrt(n_features)`.
 |      - If "sqrt", then `max_features=sqrt(n_features)`.
 |      - If "log2", then `max_features=log2(n_features)`.
 |      - If None, then `max_features=n_features`.
 |  
 |      .. versionchanged:: 1.1
 |          The default of `max_features` changed from `"auto"` to `"sqrt"`.
 |  
 |      .. deprecated:: 1.1
 |          The `"auto"` option was deprecated in 1.1 and will be removed
 |          in 1.3.
 |  
 |      Note: the search for a split does not stop until at least one
 |      valid partition of the node samples is found, even if it requires to
 |      effectively inspect more than ``max_features`` features.
 |  
 |  max_leaf_nodes : int, default=None
 |      Grow trees with ``max_leaf_nodes`` in best-first fashion.
 |      Best nodes are defined as relative reduction in impurity.
 |      If None then unlimited number of leaf nodes.
 |  
 |  min_impurity_decrease : float, default=0.0
 |      A node will be split if this split induces a decrease of the impurity
 |      greater than or equal to this value.
 |  
 |      The weighted impurity decrease equation is the following::
 |  
 |          N_t / N * (impurity - N_t_R / N_t * right_impurity
 |                              - N_t_L / N_t * left_impurity)
 |  
 |      where ``N`` is the total number of samples, ``N_t`` is the number of
 |      samples at the current node, ``N_t_L`` is the number of samples in the
 |      left child, and ``N_t_R`` is the number of samples in the right child.
 |  
 |      ``N``, ``N_t``, ``N_t_R`` and ``N_t_L`` all refer to the weighted sum,
 |      if ``sample_weight`` is passed.
 |  
 |      .. versionadded:: 0.19
 |  
 |  bootstrap : bool, default=True
 |      Whether bootstrap samples are used when building trees. If False, the
 |      whole dataset is used to build each tree.
 |  
 |  oob_score : bool, default=False
 |      Whether to use out-of-bag samples to estimate the generalization score.
 |      Only available if bootstrap=True.
 |  
 |  n_jobs : int, default=None
 |      The number of jobs to run in parallel. :meth:`fit`, :meth:`predict`,
 |      :meth:`decision_path` and :meth:`apply` are all parallelized over the
 |      trees. ``None`` means 1 unless in a :obj:`joblib.parallel_backend`
 |      context. ``-1`` means using all processors. See :term:`Glossary
 |      <n_jobs>` for more details.
 |  
 |  random_state : int, RandomState instance or None, default=None
 |      Controls both the randomness of the bootstrapping of the samples used
 |      when building trees (if ``bootstrap=True``) and the sampling of the
 |      features to consider when looking for the best split at each node
 |      (if ``max_features < n_features``).
 |      See :term:`Glossary <random_state>` for details.
 |  
 |  verbose : int, default=0
 |      Controls the verbosity when fitting and predicting.
 |  
 |  warm_start : bool, default=False
 |      When set to ``True``, reuse the solution of the previous call to fit
 |      and add more estimators to the ensemble, otherwise, just fit a whole
 |      new forest. See :term:`Glossary <warm_start>` and
 |      :ref:`gradient_boosting_warm_start` for details.
 |  
 |  class_weight : {"balanced", "balanced_subsample"}, dict or list of dicts,             default=None
 |      Weights associated with classes in the form ``{class_label: weight}``.
 |      If not given, all classes are supposed to have weight one. For
 |      multi-output problems, a list of dicts can be provided in the same
 |      order as the columns of y.
 |  
 |      Note that for multioutput (including multilabel) weights should be
 |      defined for each class of every column in its own dict. For example,
 |      for four-class multilabel classification weights should be
 |      [{0: 1, 1: 1}, {0: 1, 1: 5}, {0: 1, 1: 1}, {0: 1, 1: 1}] instead of
 |      [{1:1}, {2:5}, {3:1}, {4:1}].
 |  
 |      The "balanced" mode uses the values of y to automatically adjust
 |      weights inversely proportional to class frequencies in the input data
 |      as ``n_samples / (n_classes * np.bincount(y))``
 |  
 |      The "balanced_subsample" mode is the same as "balanced" except that
 |      weights are computed based on the bootstrap sample for every tree
 |      grown.
 |  
 |      For multi-output, the weights of each column of y will be multiplied.
 |  
 |      Note that these weights will be multiplied with sample_weight (passed
 |      through the fit method) if sample_weight is specified.
 |  
 |  ccp_alpha : non-negative float, default=0.0
 |      Complexity parameter used for Minimal Cost-Complexity Pruning. The
 |      subtree with the largest cost complexity that is smaller than
 |      ``ccp_alpha`` will be chosen. By default, no pruning is performed. See
 |      :ref:`minimal_cost_complexity_pruning` for details.
 |  
 |      .. versionadded:: 0.22
 |  
 |  max_samples : int or float, default=None
 |      If bootstrap is True, the number of samples to draw from X
 |      to train each base estimator.
 |  
 |      - If None (default), then draw `X.shape[0]` samples.
 |      - If int, then draw `max_samples` samples.
 |      - If float, then draw `max_samples * X.shape[0]` samples. Thus,
 |        `max_samples` should be in the interval `(0.0, 1.0]`.
 |  
 |      .. versionadded:: 0.22
 |  
 |  Attributes
 |  ----------
 |  estimator_ : :class:`~sklearn.tree.DecisionTreeClassifier`
 |      The child estimator template used to create the collection of fitted
 |      sub-estimators.
 |  
 |      .. versionadded:: 1.2
 |         `base_estimator_` was renamed to `estimator_`.
 |  
 |  base_estimator_ : DecisionTreeClassifier
 |      The child estimator template used to create the collection of fitted
 |      sub-estimators.
 |  
 |      .. deprecated:: 1.2
 |          `base_estimator_` is deprecated and will be removed in 1.4.
 |          Use `estimator_` instead.
 |  
 |  estimators_ : list of DecisionTreeClassifier
 |      The collection of fitted sub-estimators.
 |  
 |  classes_ : ndarray of shape (n_classes,) or a list of such arrays
 |      The classes labels (single output problem), or a list of arrays of
 |      class labels (multi-output problem).
 |  
 |  n_classes_ : int or list
 |      The number of classes (single output problem), or a list containing the
 |      number of classes for each output (multi-output problem).
 |  
 |  n_features_in_ : int
 |      Number of features seen during :term:`fit`.
 |  
 |      .. versionadded:: 0.24
 |  
 |  feature_names_in_ : ndarray of shape (`n_features_in_`,)
 |      Names of features seen during :term:`fit`. Defined only when `X`
 |      has feature names that are all strings.
 |  
 |      .. versionadded:: 1.0
 |  
 |  n_outputs_ : int
 |      The number of outputs when ``fit`` is performed.
 |  
 |  feature_importances_ : ndarray of shape (n_features,)
 |      The impurity-based feature importances.
 |      The higher, the more important the feature.
 |      The importance of a feature is computed as the (normalized)
 |      total reduction of the criterion brought by that feature.  It is also
 |      known as the Gini importance.
 |  
 |      Warning: impurity-based feature importances can be misleading for
 |      high cardinality features (many unique values). See
 |      :func:`sklearn.inspection.permutation_importance` as an alternative.
 |  
 |  oob_score_ : float
 |      Score of the training dataset obtained using an out-of-bag estimate.
 |      This attribute exists only when ``oob_score`` is True.
 |  
 |  oob_decision_function_ : ndarray of shape (n_samples, n_classes) or             (n_samples, n_classes, n_outputs)
 |      Decision function computed with out-of-bag estimate on the training
 |      set. If n_estimators is small it might be possible that a data point
 |      was never left out during the bootstrap. In this case,
 |      `oob_decision_function_` might contain NaN. This attribute exists
 |      only when ``oob_score`` is True.
 |  
 |  See Also
 |  --------
 |  sklearn.tree.DecisionTreeClassifier : A decision tree classifier.
 |  sklearn.ensemble.ExtraTreesClassifier : Ensemble of extremely randomized
 |      tree classifiers.
 |  
 |  Notes
 |  -----
 |  The default values for the parameters controlling the size of the trees
 |  (e.g. ``max_depth``, ``min_samples_leaf``, etc.) lead to fully grown and
 |  unpruned trees which can potentially be very large on some data sets. To
 |  reduce memory consumption, the complexity and size of the trees should be
 |  controlled by setting those parameter values.
 |  
 |  The features are always randomly permuted at each split. Therefore,
 |  the best found split may vary, even with the same training data,
 |  ``max_features=n_features`` and ``bootstrap=False``, if the improvement
 |  of the criterion is identical for several splits enumerated during the
 |  search of the best split. To obtain a deterministic behaviour during
 |  fitting, ``random_state`` has to be fixed.
 |  
 |  References
 |  ----------
 |  .. [1] L. Breiman, "Random Forests", Machine Learning, 45(1), 5-32, 2001.
 |  
 |  Examples
 |  --------
 |  >>> from sklearn.ensemble import RandomForestClassifier
 |  >>> from sklearn.datasets import make_classification
 |  >>> X, y = make_classification(n_samples=1000, n_features=4,
 |  ...                            n_informative=2, n_redundant=0,
 |  ...                            random_state=0, shuffle=False)
 |  >>> clf = RandomForestClassifier(max_depth=2, random_state=0)
 |  >>> clf.fit(X, y)
 |  RandomForestClassifier(...)
 |  >>> print(clf.predict([[0, 0, 0, 0]]))
 |  [1]
 |  
 |  Method resolution order:
 |      RandomForestClassifier
 |      ForestClassifier
 |      sklearn.base.ClassifierMixin
 |      BaseForest
 |      sklearn.base.MultiOutputMixin
 |      sklearn.ensemble._base.BaseEnsemble
 |      sklearn.base.MetaEstimatorMixin
 |      sklearn.base.BaseEstimator
 |      builtins.object
 |  
 |  Methods defined here:
 |  
 |  __init__(self, n_estimators=100, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='sqrt', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)
 |      Initialize self.  See help(type(self)) for accurate signature.
 |  
 |  ----------------------------------------------------------------------
 |  Data and other attributes defined here:
 |  
 |  __abstractmethods__ = frozenset()
 |  
 |  __annotations__ = {'_parameter_constraints': <class 'dict'>}
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from ForestClassifier:
 |  
 |  predict(self, X)
 |      Predict class for X.
 |      
 |      The predicted class of an input sample is a vote by the trees in
 |      the forest, weighted by their probability estimates. That is,
 |      the predicted class is the one with highest mean probability
 |      estimate across the trees.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, its dtype will be converted to
 |          ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      y : ndarray of shape (n_samples,) or (n_samples, n_outputs)
 |          The predicted classes.
 |  
 |  predict_log_proba(self, X)
 |      Predict class log-probabilities for X.
 |      
 |      The predicted class log-probabilities of an input sample is computed as
 |      the log of the mean predicted class probabilities of the trees in the
 |      forest.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, its dtype will be converted to
 |          ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      p : ndarray of shape (n_samples, n_classes), or a list of such arrays
 |          The class probabilities of the input samples. The order of the
 |          classes corresponds to that in the attribute :term:`classes_`.
 |  
 |  predict_proba(self, X)
 |      Predict class probabilities for X.
 |      
 |      The predicted class probabilities of an input sample are computed as
 |      the mean predicted class probabilities of the trees in the forest.
 |      The class probability of a single tree is the fraction of samples of
 |      the same class in a leaf.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, its dtype will be converted to
 |          ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      p : ndarray of shape (n_samples, n_classes), or a list of such arrays
 |          The class probabilities of the input samples. The order of the
 |          classes corresponds to that in the attribute :term:`classes_`.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.ClassifierMixin:
 |  
 |  score(self, X, y, sample_weight=None)
 |      Return the mean accuracy on the given test data and labels.
 |      
 |      In multi-label classification, this is the subset accuracy
 |      which is a harsh metric since you require for each sample that
 |      each label set be correctly predicted.
 |      
 |      Parameters
 |      ----------
 |      X : array-like of shape (n_samples, n_features)
 |          Test samples.
 |      
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          True labels for `X`.
 |      
 |      sample_weight : array-like of shape (n_samples,), default=None
 |          Sample weights.
 |      
 |      Returns
 |      -------
 |      score : float
 |          Mean accuracy of ``self.predict(X)`` w.r.t. `y`.
 |  
 |  ----------------------------------------------------------------------
 |  Data descriptors inherited from sklearn.base.ClassifierMixin:
 |  
 |  __dict__
 |      dictionary for instance variables
 |  
 |  __weakref__
 |      list of weak references to the object
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from BaseForest:
 |  
 |  apply(self, X)
 |      Apply trees in the forest to X, return leaf indices.
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, its dtype will be converted to
 |          ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      X_leaves : ndarray of shape (n_samples, n_estimators)
 |          For each datapoint x in X and for each tree in the forest,
 |          return the index of the leaf x ends up in.
 |  
 |  decision_path(self, X)
 |      Return the decision path in the forest.
 |      
 |      .. versionadded:: 0.18
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The input samples. Internally, its dtype will be converted to
 |          ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csr_matrix``.
 |      
 |      Returns
 |      -------
 |      indicator : sparse matrix of shape (n_samples, n_nodes)
 |          Return a node indicator matrix where non zero elements indicates
 |          that the samples goes through the nodes. The matrix is of CSR
 |          format.
 |      
 |      n_nodes_ptr : ndarray of shape (n_estimators + 1,)
 |          The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
 |          gives the indicator value for the i-th estimator.
 |  
 |  fit(self, X, y, sample_weight=None)
 |      Build a forest of trees from the training set (X, y).
 |      
 |      Parameters
 |      ----------
 |      X : {array-like, sparse matrix} of shape (n_samples, n_features)
 |          The training input samples. Internally, its dtype will be converted
 |          to ``dtype=np.float32``. If a sparse matrix is provided, it will be
 |          converted into a sparse ``csc_matrix``.
 |      
 |      y : array-like of shape (n_samples,) or (n_samples, n_outputs)
 |          The target values (class labels in classification, real numbers in
 |          regression).
 |      
 |      sample_weight : array-like of shape (n_samples,), default=None
 |          Sample weights. If None, then samples are equally weighted. Splits
 |          that would create child nodes with net zero or negative weight are
 |          ignored while searching for a split in each node. In the case of
 |          classification, splits are also ignored if they would result in any
 |          single class carrying a negative weight in either child node.
 |      
 |      Returns
 |      -------
 |      self : object
 |          Fitted estimator.
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from BaseForest:
 |  
 |  feature_importances_
 |      The impurity-based feature importances.
 |      
 |      The higher, the more important the feature.
 |      The importance of a feature is computed as the (normalized)
 |      total reduction of the criterion brought by that feature.  It is also
 |      known as the Gini importance.
 |      
 |      Warning: impurity-based feature importances can be misleading for
 |      high cardinality features (many unique values). See
 |      :func:`sklearn.inspection.permutation_importance` as an alternative.
 |      
 |      Returns
 |      -------
 |      feature_importances_ : ndarray of shape (n_features,)
 |          The values of this array sum to 1, unless all trees are single node
 |          trees consisting of only the root node, in which case it will be an
 |          array of zeros.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.ensemble._base.BaseEnsemble:
 |  
 |  __getitem__(self, index)
 |      Return the index'th estimator in the ensemble.
 |  
 |  __iter__(self)
 |      Return iterator over estimators in the ensemble.
 |  
 |  __len__(self)
 |      Return the number of estimators in the ensemble.
 |  
 |  ----------------------------------------------------------------------
 |  Readonly properties inherited from sklearn.ensemble._base.BaseEnsemble:
 |  
 |  base_estimator_
 |      Estimator used to grow the ensemble.
 |  
 |  ----------------------------------------------------------------------
 |  Methods inherited from sklearn.base.BaseEstimator:
 |  
 |  __getstate__(self)
 |      Helper for pickle.
 |  
 |  __repr__(self, N_CHAR_MAX=700)
 |      Return repr(self).
 |  
 |  __setstate__(self, state)
 |  
 |  get_params(self, deep=True)
 |      Get parameters for this estimator.
 |      
 |      Parameters
 |      ----------
 |      deep : bool, default=True
 |          If True, will return the parameters for this estimator and
 |          contained subobjects that are estimators.
 |      
 |      Returns
 |      -------
 |      params : dict
 |          Parameter names mapped to their values.
 |  
 |  set_params(self, **params)
 |      Set the parameters of this estimator.
 |      
 |      The method works on simple estimators as well as on nested objects
 |      (such as :class:`~sklearn.pipeline.Pipeline`). The latter have
 |      parameters of the form ``<component>__<parameter>`` so that it's
 |      possible to update each component of a nested object.
 |      
 |      Parameters
 |      ----------
 |      **params : dict
 |          Estimator parameters.
 |      
 |      Returns
 |      -------
 |      self : estimator instance
 |          Estimator instance.
# fit the forest on the breast cancer training split
rfc.fit(X_train_bc, y_train_bc)
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# mean accuracy as a (train, test) pair
rfc.score(X_train_bc, y_train_bc), rfc.score(X_test_bc, y_test_bc)
(1.0, 0.9521276595744681)

Important Hyperparameter

n_estimators : integer, optional (default=100)

  • The number of trees in the forest.

# n_estimators=5: a forest of only 5 trees (the default is 100)
rfc5 = RandomForestClassifier(n_estimators=5)
# fit the model on the same training split as rfc
rfc5.fit(X_train_bc, y_train_bc)
RandomForestClassifier(n_estimators=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# mean accuracy of the 5-tree forest as a (train, test) pair
rfc5.score(X_train_bc, y_train_bc), rfc5.score(X_test_bc, y_test_bc)
(1.0, 0.9414893617021277)
# feature importances in a bar graph (one bar per feature; these are importances, not coefficients)
plt.figure(figsize=(10,5))
plt.bar(dataset_bc.feature_names, rfc5.feature_importances_)
# rotate the feature names so the labels do not overlap
plt.xticks(rotation=90);
_images/9c46e028d121df98f6c717e4967e34b6c027b6be9996f761e2fd52ebf3cb0c44.png

Random Forest Regressor#

# instantiate the regressor with default hyperparameters
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()
# fit the model on the training split (X_train, y_train are defined earlier in the notebook)
rfr.fit(X_train , y_train )
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# train and test scores, displayed as a (train, test) pair
rfr.score(X_train , y_train ), rfr.score(X_test , y_test )
(0.9722250046056299, 0.8014232972663238)
# n_estimators=5: a forest of only five trees
rfr5 = RandomForestRegressor(n_estimators=5)
# fit the model on the training split
rfr5.fit(X_train , y_train )
RandomForestRegressor(n_estimators=5)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# train and test scores of the 5-tree regressor, displayed as a (train, test) pair
rfr5.score(X_train , y_train ), rfr5.score(X_test , y_test )
(0.9491468901205052, 0.7602286543954643)
# feature importances (not coefficients) of the 5-tree regressor in a bar graph
plt.figure(figsize=(10,5))
plt.bar(dataset.feature_names, rfr5.feature_importances_)
# rotate the feature-name tick labels by 90 degrees
plt.xticks(rotation=90);
_images/192171f0254b13c88f352cebe4afcfa8f0957cae828180a470bda393845c1c93.png

XGBoost#

  • tree based

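  • gradient boosting: trees are built sequentially, each one correcting the errors of the previous ones (unlike Random Forest, where trees are built independently and averaged)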

# XGBoost regressor created with no arguments, i.e. default hyperparameters
from xgboost import XGBRegressor
xgb = XGBRegressor()
# fit on the same X_train / y_train used for the random forest regressors
xgb.fit(X_train, y_train)
XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=None, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=None, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=None, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=None, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.

Logistic Regression#

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# load features and labels in a single call instead of reading the dataset twice
X_bc, y_bc = load_breast_cancer(return_X_y=True)
# hold out 33% of the samples for testing; random_state=42 keeps the split reproducible
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.33, random_state=42)
from sklearn.linear_model import LogisticRegression
# NOTE(review): max_iter is raised to 10000, presumably so the solver converges
# on the unscaled features — confirm
log_reg = LogisticRegression(max_iter=10000)
# fit the model on the training split
log_reg.fit(X_train_bc, y_train_bc)
LogisticRegression(max_iter=10000)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
log_reg.score(X_train_bc, y_train_bc)
0.9606299212598425
log_reg.score(X_test_bc, y_test_bc)
0.9680851063829787
log_reg.predict(X_test_bc[:5])
array([1, 0, 0, 1, 1])
y_test_bc[:5]
array([1, 0, 0, 1, 1])
log_reg.predict_proba(X_test_bc[:5]).round(3)
array([[0.14 , 0.86 ],
       [1.   , 0.   ],
       [0.998, 0.002],
       [0.002, 0.998],
       [0.   , 1.   ]])